Explore called duplications

Author

Claudia Zirión-Martínez

Published

February 13, 2025

Index

Setup

Libraries

Code
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)

Paths

Code
metadata_path <-
    "data/processed/metadata_ashton_desj_all_weavepop_H99.csv"
chromosomes_path <-
    "../Crypto_Desjardins_Ashton/results_clean/02.Dataset/chromosomes.csv"
chrom_lengths_path <-
    "data/processed/chromosome_lengths.tsv"
depth_by_chrom_good_desj_path <-
    "../Crypto_Desjardins/results_clean/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
depth_by_chrom_good_ashton_path <-
    "../Crypto_Ashton/results_clean/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
cnv_calls_path <-
    "../Crypto_Desjardins_Ashton/results_clean/02.Dataset/cnv/cnv_calls.tsv"
duplications_out_path <-
    "results/tables/duplications_putative.tsv"
repeats_path_prefix <-
    "../Crypto_Desjardins/results_clean/03.References/"

Prepare dataset

Metadata

Use the metadata table that has all the samples included in the final Crypto_Desjardins_Ashton dataset (n = 1055) and H99.
Select needed columns, remove H99 and get the number of samples per dataset and lineage, per lineage, and total.

Code
metadata <- read.delim(
    metadata_path,
    header=TRUE,
    sep=",",
    stringsAsFactor = TRUE)
metadata <- metadata %>%
    select(sample, strain, source, lineage, dataset, vni_subdivision)%>%
    filter(!strain == "H99") %>%
    group_by(dataset, lineage)%>%
    mutate(samples_in_dataset_lineage = n_distinct(sample))%>%
    ungroup() %>%
    group_by(lineage)%>%
    mutate(samples_in_lineage = n_distinct(sample))%>%
    ungroup()%>%
    mutate(total_samples = n_distinct(sample))%>%
    droplevels()

Chromosome names and length

Get the nice chromosome names from the chromosomes file in WeavePop.

Code
chromosome_names = read.delim(
    chromosomes_path,
    header=TRUE, sep=",")
chromosome_names <- chromosome_names %>%
    mutate(chromosome = str_pad(chromosome, 2, pad = "0"))%>%
    mutate(chromosome = as.factor(chromosome))
levels(chromosome_names$chromosome) <- paste("chr", chromosome_names$chromosome, sep="")

Get the chromosome lengths. Create the file with bash.

Code
# /usr/bin/bash
tail -n +2 ../Crypto_Desjardins/config/chromosomes.csv | \
    cut -d',' -f1 | sort | uniq | while read line 
do
seqkit fx2tab ../Crypto_Desjardins/data/references/$line.fasta -l -i -n >> data/processed/chromosome_lengths.tsv
done
Code
chromosome_lengths = read.delim(
    chrom_lengths_path,
    header=FALSE, 
    col.names=c("accession", "length"), 
    sep="\t")

Median depth of chromosomes

Code
depth_by_chrom_good_desjardins <- read.delim(
    depth_by_chrom_good_desj_path,
     header=TRUE, sep="\t")
depth_by_chrom_good_ashton <- read.delim(
    depth_by_chrom_good_ashton_path,
     header=TRUE, sep="\t")
depth_by_chrom_good <- rbind(depth_by_chrom_good_desjardins, depth_by_chrom_good_ashton)
depth_by_chrom <- depth_by_chrom_good %>%
    select(sample, accession, norm_chrom_median, norm_chrom_mean)
head(depth_by_chrom)
sample accession norm_chrom_median norm_chrom_mean
SRS404518 CP003820.1 0.98 0.97
SRS404518 CP003821.1 0.98 1.12
SRS404518 CP003822.1 0.99 1.01
SRS404518 CP003823.1 1.00 0.99
SRS404518 CP003824.1 0.99 0.98
SRS404518 CP003825.1 1.00 0.99

Called CNVs

Import the file with all called CNVs and keep only the duplications.

Code
cnv_calls <- read.delim(
    cnv_calls_path, 
    header=TRUE, sep="\t")
dup_calls <- filter(cnv_calls, cnv == "duplication") %>%
    left_join(chromosome_names, by="accession")
head(dup_calls)
accession start end cnv region_size depth norm_depth smooth_depth repeat_fraction overlap_bp feature_id sample lineage chromosome
chr02_Bt22 6001 10000 duplication 4000 491.62 8.62 2.90 0.21 832 SRS881178 VNBI chr02
chr05_Bt22 463501 475000 duplication 11500 282.04 4.95 4.95 0.12 1401 CNAG_00127,CNAG_00128,CNAG_01369,CNAG_07313 SRS881178 VNBI chr05
chr07_Bt22 606501 611000 duplication 4500 150.03 2.63 2.47 0.80 3622 CNAG_07666 SRS881178 VNBI chr07
chr12_Bt22 764001 775000 duplication 11000 95.71 1.68 1.67 0.00 0 CNAG_06985,CNAG_06986,CNAG_07011,CNAG_07918 SRS881178 VNBI chr12
chr07_Bt22 1423501 1427500 duplication 4000 172.81 2.58 1.64 0.86 3429 SRS885855 VNBI chr07
chr05_Bt22 464001 474500 duplication 10500 116.48 2.53 2.52 0.09 901 CNAG_00127,CNAG_00128,CNAG_01369,CNAG_07313 SRS885893 VNBI chr05

The Normalized depth (norm_depth) is the median of the normalized depth of the windows that are part of the CNV region.

Explore repeats

Get the percentage of each chromosome covered by repeats to know how much of a chromosome might not have reliable CNV calls.

Code
lineages <- unique(metadata$lineage)
repeats_all <- data.frame()
for(lineage in lineages){
    repeats_path <-paste(
        repeats_path_prefix,
        lineage, "/", lineage, "_repeats.bed",
        sep ="")
    repeats <- read.delim(repeats_path, 
        header=FALSE, 
        col.names=c("accession", "start", "end", "repeat_type"), 
        sep="\t")
    repeats$lineage <- lineage
    repeats_all <- rbind(repeats_all, repeats)
}
Code
repeats_percent <- repeats_all %>%
    mutate(repeat_size_each = end - start)%>%
    group_by(accession, lineage) %>%
    summarise(repeat_size = sum(repeat_size_each)) %>%
    left_join(chromosome_lengths, by="accession") %>%
    left_join(chromosome_names, by=c("accession","lineage")) %>%
    mutate(percent_repeat_size = round((repeat_size / length) * 100, 2))%>%
    mutate(chromosome = as.factor(chromosome))%>%
    select(lineage, accession, chromosome, percent_repeat_size)

Most chromosomes have repeats in between 5 and 10% of the size. In VNBII it is closer to 15% for some chormosomes.

Explore called duplications

Code
ggplot(dup_calls, aes(x = chromosome, y = norm_depth, color = repeat_fraction)) +
        geom_quasirandom()+
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Distribution of Normalized Depth of dCNVs",
        y = "Normalized Depth",
        x = "Chromosome")

To filter out the dCNVs that have very large Normalized Depth. Here I make groups of CNVs that start at the same position in the same chromosome.

Code
dup_summary <- dup_calls %>%
    group_by(start, end, accession, lineage) %>%
    summarize(max_depth = max(norm_depth),
              median_depth = median(norm_depth),
              n = n(),
              chromosome = first(chromosome),
              size = first(region_size))
Code
summary_large_dups <- dup_summary %>%
                        filter((max_depth > 5))%>%
                        arrange(desc(n))
summary_large_dups
start end accession lineage max_depth median_depth n chromosome size
339501 350500 CP003820.1 VNI 9.73 6.140 489 chr01 11000
274001 281500 CP003821.1 VNI 115.79 50.645 428 chr02 7500
274501 281000 CP003821.1 VNI 100.90 45.920 359 chr02 6500
582001 587000 CP003829.1 VNI 6.48 4.770 151 chr10 5000
1533001 1537000 CP003822.1 VNI 18.36 6.960 117 chr03 4000
1 2000 CP003820.1 VNI 32.38 5.410 108 chr01 2000
274501 281500 CP003821.1 VNI 74.82 52.900 47 chr02 7000
162501 166500 CP003831.1 VNI 5.22 3.470 45 chr12 4000
606501 611000 chr07_Bt22 VNBI 6.42 2.905 44 chr07 4500
1 2500 CP003820.1 VNI 40.55 8.510 24 chr01 2500
272501 283000 CP003821.1 VNI 61.92 43.860 13 chr02 10500
161501 167000 CP003831.1 VNI 5.37 4.250 9 chr12 5500
162001 167000 CP003831.1 VNI 8.27 4.380 9 chr12 5000
162501 167000 CP003831.1 VNI 7.63 4.435 8 chr12 4500
1156001 1160500 CP097934.1 VNBII 5.59 3.445 8 chr11 4500
895001 899500 chr11_Bt22 VNBI 6.03 4.690 7 chr11 4500
1430001 1437500 CP091251.1 VNII 8.04 7.620 4 chr05 7500
6001 10000 chr02_Bt22 VNBI 8.62 3.240 3 chr02 4000
881501 885500 CP097926.1 VNBII 5.18 3.410 3 chr03 4000
1430001 1437000 CP091251.1 VNII 8.53 4.890 3 chr05 7000
3501 10000 chr02_Bt22 VNBI 8.68 5.510 2 chr02 6500
4001 9500 chr02_Bt22 VNBI 6.29 4.340 2 chr02 5500
272001 283000 CP003821.1 VNI 31.76 30.695 2 chr02 11000
272501 282500 CP003821.1 VNI 41.14 37.875 2 chr02 10000
1156001 1161000 CP097934.1 VNBII 5.58 4.905 2 chr11 5000
1 3500 CP003820.1 VNI 6.89 6.890 1 chr01 3500
1501 6000 CP003828.1 VNI 5.51 5.510 1 chr09 4500
1501 6500 CP003828.1 VNI 5.26 5.260 1 chr09 5000
2001 7000 chr02_Bt22 VNBI 6.30 6.300 1 chr02 5000
6501 11000 CP003830.1 VNI 6.19 6.190 1 chr11 4500
274001 282000 CP003821.1 VNI 36.94 36.940 1 chr02 8000
286001 351000 CP003825.1 VNI 5.10 5.100 1 chr06 65000
339001 351500 CP003820.1 VNI 10.32 10.320 1 chr01 12500
339501 351000 CP003820.1 VNI 8.08 8.080 1 chr01 11500
339501 352500 CP003820.1 VNI 5.69 5.690 1 chr01 13000
339501 353000 CP003820.1 VNI 6.45 6.450 1 chr01 13500
339501 355500 CP003820.1 VNI 5.40 5.400 1 chr01 16000
1431001 1435500 CP091251.1 VNII 5.41 5.410 1 chr05 4500
1533001 1538000 CP003822.1 VNI 9.88 9.880 1 chr03 5000
Code
dup_calls_filtered <- dup_calls %>%
    filter(!paste(start, end, accession, lineage) %in% 
           paste(summary_large_dups$start, summary_large_dups$end, summary_large_dups$accession, summary_large_dups$lineage))
Code
ggplot(dup_calls_filtered, aes(x = chromosome, y = norm_depth, color = repeat_fraction)) +
        geom_quasirandom()+
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Distribution of Normalized Depth of dCNVs",
        y = "Normalized Depth",
        x = "Chromosome")

There is a duplication in many VNI samples in Chr02 with depth from 10 to 120X with a similar size and position, and with more than half of the CNV overlapping with repetitive sequences.

Filter out duplication of Chr02

Since that group in chr02 is the only with both depth higher than 10 and repeat fraction higher than 0.5, we use those thresholds to remove that group of CNVs.

Code
dup_calls_filtered <- dup_calls %>%
    filter(!(norm_depth > 10 & repeat_fraction > 0.5 & chromosome == "chr02"))
Code
ggplot(dup_calls_filtered, aes(x = chromosome, y = norm_depth, color = repeat_fraction)) +
        geom_quasirandom()+
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Distribution of Normalized Depth of dCNVs",
        y = "Normalized Depth",
        x = "Chromosome")

Explore repeat fraction, size and position of dCNVs of Chr01.

Code
r <- ggplot(filter(dup_calls, chromosome == "chr01"), aes(x = repeat_fraction, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        theme_minimal() +
        theme(legend.position = "none")+
        labs(title = "Repetitive Fraction vs. Depth",
            y = "Normalized Depth",
            x = "Fraction of CNV Overlaping with Repetitive Sequences")

s <- ggplot(filter(dup_calls, chromosome == "chr01"), aes(x = region_size, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        scale_x_continuous(labels = scales::comma) +
        theme_minimal() +
        theme(legend.position = "none")+
        labs(title = "Size of CNV vs. Depth",
            y = "Normalized Depth",
            x = "Size of CNV")

p <- ggplot(filter(dup_calls, chromosome == "chr01"), aes(x = start, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        scale_x_continuous(labels = scales::comma) +
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Depth Along Chromosomes",
            y = "Normalized Depth",
            x = "Position")
r/s/p

There is duplication in many VNI samples in Chr01 with a depth of up to 40X, it is not highly repetitive and it is in the same position.

Filter out duplication of Chr01 in VNI

Explore chr01 to know how to filter that out.

Print groups in chr01 with max depth higher than 4.

Code
dup_summary %>%
    filter(chromosome == "chr01", max_depth > 4)
start end accession lineage max_depth median_depth n chromosome size
1 2000 CP003820.1 VNI 32.38 5.41 108 chr01 2000
1 2500 CP003820.1 VNI 40.55 8.51 24 chr01 2500
1 3500 CP003820.1 VNI 6.89 6.89 1 chr01 3500
339001 351500 CP003820.1 VNI 10.32 10.32 1 chr01 12500
339501 350500 CP003820.1 VNI 9.73 6.14 489 chr01 11000
339501 351000 CP003820.1 VNI 8.08 8.08 1 chr01 11500
339501 352500 CP003820.1 VNI 5.69 5.69 1 chr01 13000
339501 353000 CP003820.1 VNI 6.45 6.45 1 chr01 13500
339501 355500 CP003820.1 VNI 5.40 5.40 1 chr01 16000
784001 786000 chr01_Bt22 VNBI 4.32 3.70 2 chr01 2000
1740501 1751500 chr01_Bt22 VNBI 4.26 3.21 6 chr01 11000

Print groups in chr01 that start between the positions 339000 and 340000.

Code
dup_summary %>%
    filter(chromosome == "chr01", start > 339000, start < 340000)
start end accession lineage max_depth median_depth n chromosome size
339001 351500 CP003820.1 VNI 10.32 10.32 1 chr01 12500
339501 346000 CP003820.1 VNI 2.14 1.96 9 chr01 6500
339501 347500 CP003820.1 VNI 1.94 1.94 1 chr01 8000
339501 350000 CP003820.1 VNI 2.22 2.06 3 chr01 10500
339501 350500 CP003820.1 VNI 9.73 6.14 489 chr01 11000
339501 351000 CP003820.1 VNI 8.08 8.08 1 chr01 11500
339501 352500 CP003820.1 VNI 5.69 5.69 1 chr01 13000
339501 353000 CP003820.1 VNI 6.45 6.45 1 chr01 13500
339501 355500 CP003820.1 VNI 5.40 5.40 1 chr01 16000

Filter out the CNVs in chr01 that start between the positions 337000 and 340000 and that start at position 1.

Code
dup_calls_filtered <- dup_calls_filtered %>%
    filter(!(chromosome == "chr01" & start > 339000 & start < 340000))%>%
    filter(!(chromosome == "chr01" & start == 1))
Code
ggplot(dup_calls_filtered, aes(x = chromosome, y = norm_depth, color = repeat_fraction)) +
        geom_quasirandom()+
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Distribution of Normalized Depth of dCNVs",
        y = "Normalized Depth",
        x = "Chromosome")

Explore repeat fraction, size and position of dCNVs of Chr03.

Code
r <- ggplot(filter(dup_calls, chromosome == "chr03"), aes(x = repeat_fraction, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        theme_minimal() +
        theme(legend.position = "none")+
        labs(title = "Repetitive Fraction vs. Depth",
            y = "Normalized Depth",
            x = "Fraction of CNV Overlaping with Repetitive Sequences")

s <- ggplot(filter(dup_calls, chromosome == "chr03"), aes(x = region_size, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        scale_x_continuous(labels = scales::comma) +
        theme_minimal() +
        theme(legend.position = "none")+
        labs(title = "Size of CNV vs. Depth",
            y = "Normalized Depth",
            x = "Size of CNV")

p <- ggplot(filter(dup_calls, chromosome == "chr03"), aes(x = start, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        scale_x_continuous(labels = scales::comma) +
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Depth Along Chromosomes",
            y = "Normalized Depth",
            x = "Position")
r/s/p

Explore chr03 to know how to filter that out.

Print groups in chr03 with max depth higher than 4.

Code
dup_summary %>%
    filter(chromosome == "chr03", max_depth > 4)
start end accession lineage max_depth median_depth n chromosome size
881501 885500 CP097926.1 VNBII 5.18 3.41 3 chr03 4000
1533001 1537000 CP003822.1 VNI 18.36 6.96 117 chr03 4000
1533001 1538000 CP003822.1 VNI 9.88 9.88 1 chr03 5000

Filter out the CNVs in chr03 that start at positions 1533001.

Code
dup_calls_filtered <- dup_calls_filtered %>%
    filter(!(chromosome == "chr03" & start == 1533001))
Code
ggplot(dup_calls_filtered, aes(x = chromosome, y = norm_depth, color = repeat_fraction)) +
        geom_quasirandom()+
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Distribution of Normalized Depth of dCNVs",
        y = "Normalized Depth",
        x = "Chromosome")

Join metrics from CNVs and depth of chromosome

The dataframe all_metrics contains a row per individual called duplication with the information about the CNV and the chromosome of the sample. For the chromosome-sample without any called duplication, the columns related to CNVs have NAs.

Code
all_metrics <- left_join(depth_by_chrom, dup_calls, #or dup_calls_filtered
        by = c("sample", "accession"))%>%
    select(-chromosome, -lineage)%>%
    left_join(chromosome_lengths, by = "accession") %>%
    left_join(chromosome_names,by ="accession") %>%
    left_join(metadata, by = c("sample", "lineage"))
head(all_metrics)
sample accession norm_chrom_median norm_chrom_mean start end cnv region_size depth norm_depth smooth_depth repeat_fraction overlap_bp feature_id length lineage chromosome strain source dataset vni_subdivision samples_in_dataset_lineage samples_in_lineage total_samples
SRS404518 CP003820.1 0.98 0.97 NA NA NA NA NA NA NA NA NA NA 2291499 VNI chr01 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003821.1 0.98 1.12 274501 281000 duplication 6500 4049.51 37.85 37.32 0.57 3735 1621675 VNI chr02 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003822.1 0.99 1.01 1533001 1537000 duplication 4000 1246.32 11.64 5.32 0.08 312 CNAG_06925,CNAG_06926 1575141 VNI chr03 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003823.1 1.00 0.99 NA NA NA NA NA NA NA NA NA NA 1084805 VNI chr04 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003824.1 0.99 0.98 NA NA NA NA NA NA NA NA NA NA 1814975 VNI chr05 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003825.1 1.00 0.99 NA NA NA NA NA NA NA NA NA NA 1422463 VNI chr06 Bt134 Clinical Desjardins VNIa-5 185 853 1055

Summarize the metrics of the called CNVs per chromosome. And keep the rest of the chromosome metrics.

Code
chrom_metrics <- all_metrics %>%
    group_by(sample, strain, chromosome, norm_chrom_mean, norm_chrom_median, length,
            accession,  source, lineage,
            samples_in_lineage, samples_in_dataset_lineage, 
            total_samples,dataset) %>%
    summarise(total_cnv_size = sum(region_size),
                n_cnvs = n(),
                first = min(start),
                last = max(end),
                mean_cnv_depth = round(mean(norm_depth),2),
                median_cnv_depth = round(median(norm_depth),2),
                repeat_size = sum(overlap_bp)) %>%
            ungroup()%>%
    mutate(dup_coverage_percent = round((total_cnv_size / length) * 100, 2),
            dup_span_size = last - first,
            dup_span_percent = round((dup_span_size / length) * 100, 2),
            dup_repeat_percent = round((repeat_size / length) * 100, 2),
            chromosome = as.factor(chromosome),
            dup_coverage_percent = ifelse(is.na(dup_coverage_percent), 0, dup_coverage_percent),
            dup_span_percent = ifelse(is.na(dup_span_percent), 0, dup_span_percent))
head(chrom_metrics)
sample strain chromosome norm_chrom_mean norm_chrom_median length accession source lineage samples_in_lineage samples_in_dataset_lineage total_samples dataset total_cnv_size n_cnvs first last mean_cnv_depth median_cnv_depth repeat_size dup_coverage_percent dup_span_size dup_span_percent dup_repeat_percent
ERS1142690 20949_2#1 chr01 1.00 0.98 2291499 CP003820.1 Clinical VNI 853 668 1055 Ashton 11000 1 339501 350500 7.12 7.12 0 0.48 10999 0.48 0.00
ERS1142690 20949_2#1 chr02 1.20 0.96 1621675 CP003821.1 Clinical VNI 853 668 1055 Ashton 6500 1 274501 281000 64.92 64.92 3735 0.40 6499 0.40 0.23
ERS1142690 20949_2#1 chr03 0.99 0.98 1575141 CP003822.1 Clinical VNI 853 668 1055 Ashton NA 1 NA NA NA NA NA 0.00 NA 0.00 NA
ERS1142690 20949_2#1 chr04 1.00 1.00 1084805 CP003823.1 Clinical VNI 853 668 1055 Ashton NA 1 NA NA NA NA NA 0.00 NA 0.00 NA
ERS1142690 20949_2#1 chr05 0.97 0.98 1814975 CP003824.1 Clinical VNI 853 668 1055 Ashton NA 1 NA NA NA NA NA 0.00 NA 0.00 NA
ERS1142690 20949_2#1 chr06 1.00 1.00 1422463 CP003825.1 Clinical VNI 853 668 1055 Ashton NA 1 NA NA NA NA NA 0.00 NA 0.00 NA
The total number of chromosomes in all samples in the dataset is: 14770
The total number of chromosomes in all samples with called duplications is: 5058

Explore relationship between metrics

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
        geom_point(aes(color = n_cnvs)) +
        scale_x_continuous(labels = scales::comma) +
        scale_color_distiller(palette = "GnBu", direction = -1, trans = "log2", name = "Number\n of CNVs") +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
            y = "Percent of Chromosome Spanned by CNVs",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
        geom_point(aes(color = n_cnvs)) +
        facet_wrap(~chromosome, ncol = 3)+
        scale_x_continuous(labels = scales::comma) +
        scale_color_distiller(palette = "GnBu", direction = -1, trans = "log2", name = "Number\n of CNVs") +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
            y = "Percent of Chromosome Spanned by CNVs",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
        geom_point(aes(color = dup_repeat_percent)) +
        scale_x_continuous(labels = scales::comma) +
        scale_color_distiller(palette = "YlOrRd", direction = 1, trans = "log2", name = "Percent of CNV\nOverlaped by Repeats") +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
            y = "Percent of Chromosome Spanned by CNVs",
            x = "Percent of Chromosome Covered by CNVs")
Warning in scale_color_distiller(palette = "YlOrRd", direction = 1, trans =
"log2", : log-2 transformation introduced infinite values.

Code
ggplot(chrom_metrics)+
    geom_histogram(aes(dup_coverage_percent), binwidth = 1)+
    scale_x_continuous(breaks = seq(0,100, by = 5)) +
    theme_minimal() +
    labs(title = "Histogram of Percent of Chromosome Covered by CNVs",
            y = "Count",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(filter(chrom_metrics, dup_coverage_percent > 0))+
    geom_histogram(aes(dup_coverage_percent), binwidth = 1)+
    scale_x_continuous(breaks = seq(0,100, by = 5)) +
    theme_minimal() +
    labs(title = "Histogram of Percent of Chromosome Covered by CNVs",
            subtitle = "Only values above 0%",
            y = "Count",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(filter(chrom_metrics, dup_coverage_percent > 15))+
    geom_boxplot(aes(x = chromosome, y = dup_coverage_percent))+
    geom_quasirandom(aes(x = chromosome, y = dup_coverage_percent))+
    theme_minimal() +
    labs(title = "Boxplot of Percent of Chromosome Covered by CNVs per Chromosome",
            subtitle = "Only values above 15%",
            y = "Count",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(filter(chrom_metrics, dup_span_percent > 0))+
    geom_histogram(aes(dup_span_percent), binwidth = 1)+
    scale_x_continuous(breaks = seq(0,100, by = 5)) +
    theme_minimal() +
    labs(title = "Histogram of Percent of Chromosome Spanned by CNVs",
            subtitle = "Only values above 0%",
            y = "Count",
            x = "Percent of Chromosome Spanned by CNVs")

Code
ggplot(chrom_metrics)+
    geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
    scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
    theme_minimal() +
    labs(title = "Histogram of Normalized median depth of chromosomes",
            y = "Count",
            x = "Normalized median depth of chromosomes")

Code
ggplot(filter(chrom_metrics, norm_chrom_median > 1))+
    geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
    scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
    theme_minimal() +
    labs(title = "Histogram of Normalized median depth of chromosomes",
            subtitle = "Only values above 1",
            y = "Count",
            x = "Normalized median depth of chromosomes")

Code
ggplot(filter(chrom_metrics, norm_chrom_median > 1.2))+
    geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
    scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
    theme_minimal() +
    labs(title = "Histogram of Normalized median depth of chromosomes",
            subtitle = "Only values above 1.2",
            y = "Count",
            x = "Normalized median depth of chromosomes")

Code
ggplot(chrom_metrics)+
    geom_histogram(aes(median_cnv_depth), binwidth = 0.01)+
    scale_x_continuous(breaks = seq(0, max(chrom_metrics$median_cnv_depth, na.rm = TRUE), by = 0.5)) +
    theme_minimal() +
    labs(title = "Histogram of Median Depth of CNVs",
            y = "Count",
            x = "Median Depth of CNVs")
Warning: Removed 9712 rows containing non-finite outside the scale range
(`stat_bin()`).

Compare metrics of CNVs with chromosome depth

The Normalized depth (norm_depth) is the median of the normalized depth of the windows that are part of each CNV region.

The Median depth of CNV (median_cnv_depth) is the median of the Normalized depth of all CNVs in the chromosome. There are values bellow the threshold of depth to call a Duplication CNV because that threshold was applied to the smoothed values and this are the raw values.

The Normalized median depth of chromosomes (norm_chrom_median) is the normalized median of the depth of the positions in the whole chromosome. (First the median was calculated and then normalized).

The Percent of Chromosome Covered by CNVs (dup_coverage_percent) is the percentage of the full length of the chromosome that is part of called CNVs.

The Percent of Chromosome Spanned by CNVs (dup_span_percent) is the percentage of the full length of the chromosome that is in between the leftmost and rightmost CNVs.

Code
color_range <- paste("2.6 -", max(chrom_metrics$median_cnv_depth, na.rm = TRUE))
colors <- c("#FDE725", "#1F9E89", "#440154")
names(colors) <-c("0 - 1.6", "1.6 - 2.6", color_range)
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_point(aes(color = cut(median_cnv_depth, 
                            breaks = c(-Inf, 1.6, 2.6, Inf), 
                            labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
                            alpha = 0.5) +
        scale_color_manual(values = colors, 
                            name = "Median CNV Depth") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
             y = "Normalized Median Depth of Chromosome",
             x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        facet_wrap(~chromosome, ncol = 4)+
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_point(aes(color = cut(median_cnv_depth, 
                            breaks = c(-Inf, 1.6, 2.6, Inf), 
                            labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
                            alpha = 0.5) +
        scale_color_manual(values = colors, 
                            name = "Median CNV Depth") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
             y = "Normalized Median Depth of Chromosome",
             x = "Percent of Chromosome Covered by CNVs")

Explore putative duplicated chromosomes

Filter by percent of chromosome in CNV or chromosome depth

size_threshold <- 50
depht_threshold <- 1.55
Code
chrom_metrics_filtered <- chrom_metrics %>%
    filter(dup_coverage_percent >= size_threshold | norm_chrom_median >= 1.55)
Code
ggplot(chrom_metrics_filtered, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_point(aes(color = cut(median_cnv_depth, 
                            breaks = c(-Inf, 1.6, 2.6, Inf), 
                            labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
                            alpha = 0.5) +
        scale_color_manual(values = colors, 
                            name = "Median CNV Depth") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
             y = "Normalized Median Depth of Chromosome",
             x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics_filtered, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        facet_wrap(~chromosome, ncol = 3)+

        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_point(aes(color = cut(median_cnv_depth, 
                            breaks = c(-Inf, 1.6, 2.6, Inf), 
                            labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
                            alpha = 0.5) +
        scale_color_manual(values = colors, 
                            name = "Median CNV Depth") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
             y = "Normalized Median Depth of Chromosome",
             x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics_filtered, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        geom_point(aes(color = n_cnvs)) +
        facet_wrap(~chromosome, ncol = 3)+
        scale_x_continuous(labels = scales::comma) +
        scale_color_distiller(palette = "GnBu", direction = -1, trans = "log2", name = "Number\n of CNVs") +
        theme_minimal() +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
            y = "Normalized Median Depth of Chromosome",
            x = "Percent of Chromosome Covered by CNVs")+
        theme(panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            panel.grid.major.y = element_line(color = "lightgray"),
            panel.grid.minor.y = element_blank(),
            panel.background = element_rect(fill = "white"),
            axis.text.x = element_text(angle = 45, hjust = 1),
            panel.border = element_rect(colour = "lightgray", fill=NA, linewidth = 2),
            legend.position = "right")

Code
ggplot(chrom_metrics_filtered, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        geom_point(aes(color = dup_span_percent)) +
        facet_wrap(~chromosome, ncol = 3)+
        scale_x_continuous(labels = scales::comma) +
        scale_color_distiller(palette = "BuPu", direction = 1, name = "Percent Spanned\nby CNVs") +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
            y = "Normalized Median Depth of Chromosome",
            x = "Percent of Chromosome Covered by CNVs")+
        theme_minimal()+
        theme(panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            panel.grid.major.y = element_line(color = "lightgray"),
            panel.grid.minor.y = element_blank(),
            panel.background = element_rect(fill = "white"),
            axis.text.x = element_text(angle = 45, hjust = 1),
            panel.border = element_rect(colour = "lightgray", fill=NA, linewidth = 2),
            legend.position = "right")

Code
write_tsv(chrom_metrics_filtered, duplications_out_path)

Explore sample with very large Median Depth of Chromosome

Code
chrom_metrics_filtered %>%
    filter(norm_chrom_median > 2.5)%>%
    select(sample, strain, chromosome, norm_chrom_median, dup_coverage_percent)
sample strain chromosome norm_chrom_median dup_coverage_percent
ERS542397 14936_1#6 chr04 2.74 64.94